#1. Histogram of ratings
library(tidyverse)
library(here)
# Load the chocolate bar ratings stored under data/ in the project.
chocolate <- readRDS(here("data", "chocolate.RDS"))

# Histogram of ratings. 15 bins is the bin count justified in the paragraph
# that follows the plot.
ggplot(chocolate, aes(x = rating)) +
  geom_histogram(bins = 15) +
  ggtitle("Histogram of chocolate bar ratings")
Increasing the number of bins allows more granularity of data, better showing the shape of the distribution. However, if we increase the number of bins too much we start to have gaps in our data because the number of bins exceeds the number of potential scores. I selected a 15 bin histogram because it nicely shows the shape of the distribution but doesn’t have the gapping a 20+ bin histogram does.
#2. Number of ratings by country of bean origin
library(dplyr)
chocolate <- readRDS(here("data", "chocolate.RDS"))

# Number of reviews per country of bean origin, most-reviewed first.
# The result is not called `count`, so it no longer shadows dplyr::count().
origin_counts <- count(chocolate, country_of_bean_origin, sort = TRUE)

# n = Inf prints every row regardless of how many origins the data holds.
print(origin_counts, n = Inf)
## # A tibble: 62 × 2
## country_of_bean_origin n
## <chr> <int>
## 1 Venezuela 253
## 2 Peru 244
## 3 Dominican Republic 226
## 4 Ecuador 219
## 5 Madagascar 177
## 6 Blend 156
## 7 Nicaragua 100
## 8 Bolivia 80
## 9 Colombia 79
## 10 Tanzania 79
## 11 Brazil 78
## 12 Belize 76
## 13 Vietnam 73
## 14 Guatemala 62
## 15 Mexico 55
## 16 Papua New Guinea 50
## 17 Costa Rica 43
## 18 Trinidad 42
## 19 Ghana 41
## 20 India 35
## 21 U.S.A. 33
## 22 Haiti 30
## 23 Honduras 25
## 24 Jamaica 24
## 25 Philippines 24
## 26 Indonesia 20
## 27 Grenada 19
## 28 Uganda 19
## 29 Fiji 16
## 30 Sao Tome 14
## 31 Vanuatu 13
## 32 Cuba 12
## 33 Congo 11
## 34 Solomon Islands 10
## 35 St. Lucia 10
## 36 Panama 9
## 37 Malaysia 8
## 38 Ivory Coast 7
## 39 Puerto Rico 7
## 40 El Salvador 6
## 41 Thailand 5
## 42 Sierra Leone 4
## 43 Australia 3
## 44 Cameroon 3
## 45 Liberia 3
## 46 Nigeria 3
## 47 Samoa 3
## 48 Togo 3
## 49 Sao Tome & Principe 2
## 50 Sri Lanka 2
## 51 Taiwan 2
## 52 Tobago 2
## 53 Burma 1
## 54 China 1
## 55 DR Congo 1
## 56 Gabon 1
## 57 Martinique 1
## 58 Principe 1
## 59 St.Vincent-Grenadines 1
## 60 Sulawesi 1
## 61 Sumatra 1
## 62 Suriname 1
The number of ratings by each country of origin is described in the table above
#3. Average ratings for Ecuadorian beans
library(dplyr)
library(knitr)
chocolate <- readRDS(here("data", "chocolate.RDS"))
# One-row summary of reviews of Ecuadorian-bean bars: mean rating, standard
# deviation of the rating, and the number of non-missing ratings.
ecud <- chocolate %>%
  filter(country_of_bean_origin == "Ecuador") %>%
  summarise(
    mean = mean(rating, na.rm = TRUE),
    sd = sd(rating, na.rm = TRUE),
    total = sum(!is.na(rating))
  )

# Render the summary as a pipe-format table.
knitr::kable(ecud, "pipe")
| mean | sd | total |
|---|---|---|
| 3.164384 | 0.5122678 | 219 |
Average ratings, standard deviation of ratings and total number of ratings for Ecuadorian chocolates
#4.Best manufacturer
library(dplyr)
chocolate <- readRDS(here("data", "chocolate.RDS"))
# Keep reviews of Ecuadorian-bean bars and group them by manufacturer so the
# next call can average the rating per manufacturer. The overall mean, sd and
# count columns from the previous section are not needed here.
# NOTE(review): the summarize() call that follows is not sorted by rating, so
# its first row is the alphabetically first manufacturer, not the best-rated
# one.
ecud <- filter(chocolate, country_of_bean_origin == "Ecuador")
company <- group_by(ecud, company_manufacturer)
summarize(company, rating=mean(rating, na.rm = TRUE))## # A tibble: 136 × 2
## company_manufacturer rating
## <chr> <dbl>
## 1 A. Morin 3.75
## 2 Aequare (Gianduja) 2.88
## 3 Alexandre 3.5
## 4 Altus aka Cao Artisan 2.75
## 5 Amano 4
## 6 Amatller (Simon Coll) 2.75
## 7 Amedei 3
## 8 Ara 2.75
## 9 Arete 3.44
## 10 Askinosie 3
## # ℹ 126 more rows
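Of the Ecuadorian bean chocolates, A. Morin is the first manufacturer listed, but the table above is ordered alphabetically by manufacturer rather than by rating; among the rows shown, Amano (4.00) has a higher average rating than A. Morin (3.75).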
#5.Average rating by country of origin
chocolate <- readRDS(here("data", "chocolate.RDS"))

# Mean rating per country of bean origin, highest mean first.
country <- chocolate %>%
  group_by(country_of_bean_origin) %>%
  summarize(rating = mean(rating, na.rm = TRUE)) %>%
  arrange(desc(rating))

# Show the six highest-rated origins as a pipe-format table.
knitr::kable(head(country[, 1:2]), "pipe")
| country_of_bean_origin | rating |
|---|---|
| Tobago | 3.625000 |
| China | 3.500000 |
| Sao Tome & Principe | 3.500000 |
| Solomon Islands | 3.450000 |
| Congo | 3.318182 |
| Thailand | 3.300000 |
Tobago (3.63), China (3.50), and Sao Tome & Principe (3.50) are the countries of origin that have the highest average ratings.
#6.Average rating by country of origin in countries with 10+ ratings
library(dplyr)
chocolate <- readRDS(here("data", "chocolate.RDS"))
country <- group_by(chocolate, country_of_bean_origin)

# Keep only origins with at least 10 reviews (n() is the size of the current
# group), then rank those origins by mean rating, highest first.
country_n <- country %>%
  filter(n() >= 10) %>%
  summarize(rating = mean(rating, na.rm = TRUE)) %>%
  arrange(desc(rating))

# Show the six highest-rated origins as a pipe-format table.
knitr::kable(head(country_n[, 1:2]), "pipe")
| country_of_bean_origin | rating |
|---|---|
| Solomon Islands | 3.450000 |
| Congo | 3.318182 |
| Cuba | 3.291667 |
| Vietnam | 3.287671 |
| Papua New Guinea | 3.280000 |
| Madagascar | 3.266949 |
Solomon Islands (3.45), Congo (3.32), and Cuba (3.29) are the countries of origin that have the highest average rating after filtering out countries with fewer than 10 chocolate bar reviews.
#7.
library(dplyr)
library(forcats)
chocolate <- readRDS(here("data", "chocolate.RDS"))

# Keep origins with at least 50 reviews and bin the cocoa percentage into four
# ordered bands:
#   "1": below 60, "2": 60 to <70, "3": 70 to <90, "4": 90 and above.
# The data are ungrouped before the factor is built so that every origin
# shares the same level order ("1" < "2" < "3" < "4"), even when an origin
# has no bars in one of the bands.
country <- chocolate %>%
  group_by(country_of_bean_origin) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  filter(n >= 50) %>%
  mutate(
    # cocoa_percent is text with a trailing "%"; strip it before converting.
    c_percent = as.numeric(str_remove(cocoa_percent, "%")),
    # right = FALSE makes each band closed on the left: [60, 70), [70, 90), ...
    percent = cut(
      c_percent,
      breaks = c(-Inf, 60, 70, 90, Inf),
      labels = c("1", "2", "3", "4"),
      right = FALSE
    )
  )
# One boxplot of rating by cocoa-percent band for every origin kept in
# `country`, drawn in alphabetical order of origin name. This replaces one
# copy-pasted qplot() call per origin; qplot() is deprecated as of
# ggplot2 3.4.0.
boxplot_origins <- sort(unique(country$country_of_bean_origin))
for (origin_name in boxplot_origins) {
  origin_reviews <- filter(country, country_of_bean_origin == origin_name)
  # print() is required for a ggplot object to be drawn inside a loop.
  print(
    ggplot(origin_reviews, aes(x = percent, y = rating)) +
      geom_boxplot() +
      ggtitle(origin_name)
  )
}
library(gapminder)
library(tidyverse)
library(dplyr)
library(here)
library(ggplot2)
chocolate <- readRDS(here("data", "chocolate.RDS"))
chocolate <- rename(chocolate, country = country_of_bean_origin)

# Build a one-row-per-country lookup of country -> continent from gapminder.
# Both columns are converted to character so the join key matches the
# character `country` column in `chocolate`, and so `continent` can later be
# combined with plain strings without factor/character mixing.
gapminder_2 <- gapminder %>%
  select(country, continent) %>%
  distinct(country, .keep_all = TRUE) %>%
  mutate(across(c(country, continent), as.character))

# Left join keeps every review; origins absent from gapminder get NA.
chocolate <- left_join(x = chocolate, y = gapminder_2, by = "country")
# Manually assign a continent to origins named here; every other row keeps
# the continent it already has. "Blend" is given its own pseudo-continent.
americas_origins <- c(
  "Belize", "St.Vincent-Grenadines", "Grenada", "St. Lucia", "Martinique",
  "U.S.A.", "Suriname", "Tobago", "Trinidad"
)
asia_origins <- c("Burma", "Sulawesi", "Sumatra")
africa_origins <- c(
  "Congo", "Ivory Coast", "Principe", "Sao Tome & Principe", "Sao Tome",
  "DR Congo"
)
oceania_origins <- c(
  "Fiji", "Papua New Guinea", "Samoa", "Solomon Islands", "Vanuatu"
)

chocolate <- mutate(
  chocolate,
  continent = case_when(
    country %in% americas_origins ~ "Americas",
    country %in% asia_origins ~ "Asia",
    country %in% africa_origins ~ "Africa",
    country %in% oceania_origins ~ "Oceania",
    country == "Blend" ~ "Blend",
    # as.character() keeps the result a character column even if the joined
    # continent column is a factor.
    .default = as.character(continent)
  )
)
#check to ensure there are no NA's
# The check works by listing the continent groups: an unassigned origin would
# show up as an extra NA row in the printed tibble. The distinct_points column
# itself is not informative, because n_distinct(continent) computed within a
# group_by(continent) group is always 1.
chocolate %>% group_by(continent) %>% summarize(distinct_points = n_distinct(continent))## # A tibble: 5 × 2
## continent distinct_points
## <chr> <int>
## 1 Africa 1
## 2 Americas 1
## 3 Asia 1
## 4 Blend 1
## 5 Oceania 1
# Attach each origin's review count as `n`, then keep only origins with at
# least 10 reviews and exclude the "Blend" rows. The result stays grouped by
# country.
chocolate <- chocolate %>%
  group_by(country) %>%
  mutate(n = n()) %>%
  filter(n >= 10, country != "Blend")
# Violin plot setup: one subset of the reviews per continent.
Africa <- filter(chocolate, continent == "Africa")
Asia <- filter(chocolate, continent == "Asia")
Americas <- filter(chocolate, continent == "Americas")
Oceania <- filter(chocolate, continent == "Oceania")

# Violin plots: each figure shows the rating distribution for one continent.

# Violin plot of chocolate ratings in African countries
ggplot(Africa, aes(continent, rating)) +
  geom_violin()

# Violin plot of chocolate ratings in Asian countries
ggplot(Asia, aes(continent, rating)) +
  geom_violin()

# Violin plot of chocolate ratings in countries in the Americas
ggplot(Americas, aes(continent, rating)) +
  geom_violin()

# Violin plot of chocolate ratings in Oceanic countries
ggplot(Oceania, aes(continent, rating)) +
  geom_violin()
library(tidyverse)
library(here)
chocolate <- readRDS(here("data", "chocolate.RDS"))

# Return 1 where the regular expression `pattern` is found in `text` and 0
# elsewhere. grepl() returns FALSE for NA input, so missing text is coded 0.
flag_match <- function(pattern, text) {
  as.numeric(grepl(pattern, text))
}

# Add one 0/1 column per ingredient code and per flavour word.
# NOTE(review): the patterns are unanchored, so "S" also matches any
# ingredient code that merely contains a capital S, including the "Sa" code
# used for salt below. Confirm that this is the intended definition of
# `sugar`.
chocolate <- mutate(
  chocolate,
  # ingredient columns
  beans = flag_match("B", ingredients),
  sugar = flag_match("S", ingredients),
  cocoa_butter = flag_match("C", ingredients),
  vanilla = flag_match("V", ingredients),
  lecithin = flag_match("L", ingredients),
  salt = flag_match("Sa", ingredients),
  # characteristic columns
  char_cocoa = flag_match("cocoa", most_memorable_characteristics),
  char_sweet = flag_match("sweet", most_memorable_characteristics),
  char_nutty = flag_match("nutty", most_memorable_characteristics),
  char_creamy = flag_match("creamy", most_memorable_characteristics),
  char_roasty = flag_match("roasty", most_memorable_characteristics),
  char_earthy = flag_match("earthy", most_memorable_characteristics)
)
# Group by review year and average every 0/1 indicator column, giving the
# share of that year's reviews in which each ingredient or characteristic
# appears.
feature_cols <- c(
  "beans", "sugar", "cocoa_butter", "vanilla", "lecithin", "salt",
  "char_cocoa", "char_sweet", "char_nutty", "char_creamy", "char_roasty",
  "char_earthy"
)
chocolate <- chocolate %>%
  group_by(review_date) %>%
  summarize(
    across(all_of(feature_cols), function(flag) mean(flag, na.rm = TRUE))
  )

# Reshape to one row per (review year, feature).
chocolate_long <- chocolate %>%
  pivot_longer(-review_date, names_to = "features", values_to = "mean_score")

# n = Inf prints every row regardless of how many years the data holds.
print(chocolate_long, n = Inf)
## # A tibble: 192 × 3
## review_date features mean_score
## <dbl> <chr> <dbl>
## 1 2006 beans 0.968
## 2 2006 sugar 0.968
## 3 2006 cocoa_butter 0.903
## 4 2006 vanilla 0.694
## 5 2006 lecithin 0.694
## 6 2006 salt 0
## 7 2006 char_cocoa 0.210
## 8 2006 char_sweet 0.161
## 9 2006 char_nutty 0.0323
## 10 2006 char_creamy 0.242
## 11 2006 char_roasty 0.0484
## 12 2006 char_earthy 0.0645
## 13 2007 beans 0.945
## 14 2007 sugar 0.945
## 15 2007 cocoa_butter 0.767
## 16 2007 vanilla 0.548
## 17 2007 lecithin 0.384
## 18 2007 salt 0
## 19 2007 char_cocoa 0.342
## 20 2007 char_sweet 0.0959
## 21 2007 char_nutty 0.0411
## 22 2007 char_creamy 0.233
## 23 2007 char_roasty 0.0137
## 24 2007 char_earthy 0.0685
## 25 2008 beans 0.913
## 26 2008 sugar 0.902
## 27 2008 cocoa_butter 0.75
## 28 2008 vanilla 0.359
## 29 2008 lecithin 0.511
## 30 2008 salt 0
## 31 2008 char_cocoa 0.109
## 32 2008 char_sweet 0.130
## 33 2008 char_nutty 0.152
## 34 2008 char_creamy 0.0978
## 35 2008 char_roasty 0.0435
## 36 2008 char_earthy 0.0435
## 37 2009 beans 0.919
## 38 2009 sugar 0.919
## 39 2009 cocoa_butter 0.772
## 40 2009 vanilla 0.325
## 41 2009 lecithin 0.341
## 42 2009 salt 0
## 43 2009 char_cocoa 0.146
## 44 2009 char_sweet 0.154
## 45 2009 char_nutty 0.154
## 46 2009 char_creamy 0.0894
## 47 2009 char_roasty 0.0813
## 48 2009 char_earthy 0.0732
## 49 2010 beans 0.855
## 50 2010 sugar 0.855
## 51 2010 cocoa_butter 0.709
## 52 2010 vanilla 0.227
## 53 2010 lecithin 0.391
## 54 2010 salt 0.00909
## 55 2010 char_cocoa 0.218
## 56 2010 char_sweet 0.1
## 57 2010 char_nutty 0.145
## 58 2010 char_creamy 0.0909
## 59 2010 char_roasty 0.0364
## 60 2010 char_earthy 0.0727
## 61 2011 beans 0.939
## 62 2011 sugar 0.939
## 63 2011 cocoa_butter 0.693
## 64 2011 vanilla 0.160
## 65 2011 lecithin 0.160
## 66 2011 salt 0.0491
## 67 2011 char_cocoa 0.172
## 68 2011 char_sweet 0.110
## 69 2011 char_nutty 0.117
## 70 2011 char_creamy 0.129
## 71 2011 char_roasty 0.0736
## 72 2011 char_earthy 0.0613
## 73 2012 beans 0.928
## 74 2012 sugar 0.928
## 75 2012 cocoa_butter 0.675
## 76 2012 vanilla 0.186
## 77 2012 lecithin 0.124
## 78 2012 salt 0.0722
## 79 2012 char_cocoa 0.0876
## 80 2012 char_sweet 0.139
## 81 2012 char_nutty 0.103
## 82 2012 char_creamy 0.0722
## 83 2012 char_roasty 0.0619
## 84 2012 char_earthy 0.0464
## 85 2013 beans 0.967
## 86 2013 sugar 0.956
## 87 2013 cocoa_butter 0.776
## 88 2013 vanilla 0.208
## 89 2013 lecithin 0.295
## 90 2013 salt 0.0164
## 91 2013 char_cocoa 0.175
## 92 2013 char_sweet 0.126
## 93 2013 char_nutty 0.115
## 94 2013 char_creamy 0.0710
## 95 2013 char_roasty 0.109
## 96 2013 char_earthy 0.0492
## 97 2014 beans 0.984
## 98 2014 sugar 0.984
## 99 2014 cocoa_butter 0.644
## 100 2014 vanilla 0.0688
## 101 2014 lecithin 0.121
## 102 2014 salt 0.0324
## 103 2014 char_cocoa 0.0607
## 104 2014 char_sweet 0.0972
## 105 2014 char_nutty 0.158
## 106 2014 char_creamy 0.0486
## 107 2014 char_roasty 0.0972
## 108 2014 char_earthy 0.101
## 109 2015 beans 0.986
## 110 2015 sugar 0.979
## 111 2015 cocoa_butter 0.546
## 112 2015 vanilla 0.0599
## 113 2015 lecithin 0.120
## 114 2015 salt 0
## 115 2015 char_cocoa 0.127
## 116 2015 char_sweet 0.106
## 117 2015 char_nutty 0.109
## 118 2015 char_creamy 0.0423
## 119 2015 char_roasty 0.123
## 120 2015 char_earthy 0.0810
## 121 2016 beans 0.982
## 122 2016 sugar 0.977
## 123 2016 cocoa_butter 0.594
## 124 2016 vanilla 0.0507
## 125 2016 lecithin 0.106
## 126 2016 salt 0.00922
## 127 2016 char_cocoa 0.0922
## 128 2016 char_sweet 0.171
## 129 2016 char_nutty 0.157
## 130 2016 char_creamy 0.0553
## 131 2016 char_roasty 0.101
## 132 2016 char_earthy 0.111
## 133 2017 beans 0.981
## 134 2017 sugar 0.981
## 135 2017 cocoa_butter 0.562
## 136 2017 vanilla 0.0286
## 137 2017 lecithin 0.133
## 138 2017 salt 0.00952
## 139 2017 char_cocoa 0.133
## 140 2017 char_sweet 0.0952
## 141 2017 char_nutty 0.0667
## 142 2017 char_creamy 0.0952
## 143 2017 char_roasty 0.124
## 144 2017 char_earthy 0.124
## 145 2018 beans 0.987
## 146 2018 sugar 0.987
## 147 2018 cocoa_butter 0.596
## 148 2018 vanilla 0.0614
## 149 2018 lecithin 0.132
## 150 2018 salt 0
## 151 2018 char_cocoa 0.180
## 152 2018 char_sweet 0.118
## 153 2018 char_nutty 0.0789
## 154 2018 char_creamy 0.0439
## 155 2018 char_roasty 0.110
## 156 2018 char_earthy 0.123
## 157 2019 beans 1
## 158 2019 sugar 1
## 159 2019 cocoa_butter 0.679
## 160 2019 vanilla 0.0259
## 161 2019 lecithin 0.202
## 162 2019 salt 0
## 163 2019 char_cocoa 0.259
## 164 2019 char_sweet 0.145
## 165 2019 char_nutty 0.0725
## 166 2019 char_creamy 0.0881
## 167 2019 char_roasty 0.109
## 168 2019 char_earthy 0.0415
## 169 2020 beans 1
## 170 2020 sugar 1
## 171 2020 cocoa_butter 0.568
## 172 2020 vanilla 0.0370
## 173 2020 lecithin 0.0247
## 174 2020 salt 0
## 175 2020 char_cocoa 0.284
## 176 2020 char_sweet 0.160
## 177 2020 char_nutty 0.0494
## 178 2020 char_creamy 0.0370
## 179 2020 char_roasty 0.0988
## 180 2020 char_earthy 0.0988
## 181 2021 beans 1
## 182 2021 sugar 0.994
## 183 2021 cocoa_butter 0.646
## 184 2021 vanilla 0.0114
## 185 2021 lecithin 0.0800
## 186 2021 salt 0
## 187 2021 char_cocoa 0.297
## 188 2021 char_sweet 0.126
## 189 2021 char_nutty 0.0971
## 190 2021 char_creamy 0.0171
## 191 2021 char_roasty 0.0743
## 192 2021 char_earthy 0.0686
Long dataset of mean scores for each ingredient and main characteristic by review year
library(tidyverse)
library(here)
library(ggplot2)
# Tidy the feature labels for the legend: drop the "char_" prefix, write
# "cocoa butter" with a space, and order the levels so that ingredients come
# first and flavours second.
chocolate_long <- chocolate_long %>%
  mutate(
    features = str_remove(features, "char_"),
    features = if_else(features == "cocoa_butter", "cocoa butter", features),
    features = fct_relevel(
      features,
      c("cocoa butter", "lecithin", "sugar", "salt", "vanilla"),
      after = 1
    )
  )

# Create plot. mean_score is the yearly mean of a 0/1 indicator, i.e. the
# share of that year's reviews with the feature, not a rating, so the title
# and y-axis label describe it as a proportion.
chocolate_long %>%
  ggplot(aes(review_date, mean_score)) +
  geom_point(aes(color = features)) +
  geom_smooth(aes(group = features, color = features), se = FALSE) +
  labs(
    title = "Trends in the share of chocolate bar reviews listing each ingredient and\nkey characteristic from 2006 to 2021",
    caption = "Meriam Berka"
  ) +
  xlab("Year") +
  ylab("Proportion of reviews") +
  scale_x_continuous(
    breaks = c(2008, 2012, 2016, 2020),
    minor_breaks = c(2006, 2010, 2014, 2018, 2022)
  ) +
  theme_gray()
library(tidyverse)
library(here)
library(ggplot2)
# Second version of the yearly feature plot: points coloured by year, a single
# smoother across all features, and a dark theme.
# NOTE(review): the caption and the placeholder x-axis label suggest this
# figure is deliberately poor. Confirm before changing its labels; they are
# kept exactly as written.
chocolate_long %>%
  ggplot(aes(review_date, mean_score)) +
  geom_point(aes(color = review_date)) +
  geom_smooth(color = "burlywood4") +
  labs(
    title = "Ave chocolate bar ratings by ingredients and key characteristics from 2006 to 2021",
    caption = "Meriam Berka doesn't endorse this"
  ) +
  xlab("date123%") +
  ylab(NULL) +
  scale_x_continuous(breaks = c(2006, 2017, 2018, 2021)) +
  theme_dark()
library(tidyverse)
library(here)
library(ggplot2)
chocolate <- readRDS(here("data", "chocolate.RDS"))

# Violin plot of ratings for each review year, with the median drawn inside
# each violin (draw_quantiles = 0.5), on a dark custom theme.
chocolate %>%
  ggplot(aes(x = as.factor(review_date), y = rating)) +
  geom_violin(draw_quantiles = c(0.5), color = "#FF99FF", fill = "#FFCCFF") +
  ggtitle("Distribution of chocolate bar ratings from 0 to 4 by year of\nreview, 2006-2021") +
  # The stray "')" that ended the caption has been removed.
  labs(caption = "Data from Tidy Tuesday 01-11-2022:\nR4DS Online Learning Community (2023). Tidy Tuesday: A weekly social data project.\nhttps://github.com/rfordatascience/tidytuesday.") +
  xlab("Year of review") +
  ylab("Rating (0 to 4)") +
  theme(
    plot.title = element_text(
      margin = margin(0.01, 0, 20, 0), family = "serif", face = "bold"
    ),
    axis.text.x = element_text(
      angle = 270, vjust = 0.5, hjust = 1, color = "white"
    ),
    axis.text.y = element_text(color = "white"),
    axis.title.x = element_text(margin = margin(12, 0, 25, 0)),
    axis.title.y = element_text(margin = margin(10, 10, 10, 5)),
    plot.caption = element_text(hjust = 0, color = "white"),
    # element_line() takes `linewidth`; its `size` argument is deprecated as
    # of ggplot2 3.4.0.
    panel.grid.major = element_line(
      linewidth = 0.5, linetype = "solid", colour = "lavender"
    ),
    panel.grid.minor = element_line(
      linewidth = 0.25, linetype = "solid", colour = "lavender"
    ),
    text = element_text(color = "white"),
    plot.background = element_rect(fill = "gray7"),
    panel.background = element_rect(fill = "gray60"),
    plot.margin = unit(c(0.5, 0.75, 0.5, 0.75), "inches")
  )
#1. Histogram of ratings
library(tidyverse)
library(here)
# Load the chocolate bar ratings stored under data/ in the project.
chocolate <- readRDS(here("data", "chocolate.RDS"))

# Histogram of ratings. 15 bins is the bin count justified in the paragraph
# that follows the plot.
ggplot(chocolate, aes(x = rating)) +
  geom_histogram(bins = 15) +
  ggtitle("Histogram of chocolate bar ratings")
Increasing the number of bins allows more granularity of data, better showing the shape of the distribution. However, if we increase the number of bins too much we start to have gaps in our data because the number of bins exceeds the number of potential scores. I selected a 15 bin histogram because it nicely shows the shape of the distribution but doesn’t have the gapping a 20+ bin histogram does.
#2. Number of ratings by country of bean origin
library(dplyr)
chocolate <- readRDS(here("data", "chocolate.RDS"))

# Number of reviews per country of bean origin, most-reviewed first.
# The result is not called `count`, so it no longer shadows dplyr::count().
origin_counts <- count(chocolate, country_of_bean_origin, sort = TRUE)

# n = Inf prints every row regardless of how many origins the data holds.
print(origin_counts, n = Inf)
## # A tibble: 62 × 2
## country_of_bean_origin n
## <chr> <int>
## 1 Venezuela 253
## 2 Peru 244
## 3 Dominican Republic 226
## 4 Ecuador 219
## 5 Madagascar 177
## 6 Blend 156
## 7 Nicaragua 100
## 8 Bolivia 80
## 9 Colombia 79
## 10 Tanzania 79
## 11 Brazil 78
## 12 Belize 76
## 13 Vietnam 73
## 14 Guatemala 62
## 15 Mexico 55
## 16 Papua New Guinea 50
## 17 Costa Rica 43
## 18 Trinidad 42
## 19 Ghana 41
## 20 India 35
## 21 U.S.A. 33
## 22 Haiti 30
## 23 Honduras 25
## 24 Jamaica 24
## 25 Philippines 24
## 26 Indonesia 20
## 27 Grenada 19
## 28 Uganda 19
## 29 Fiji 16
## 30 Sao Tome 14
## 31 Vanuatu 13
## 32 Cuba 12
## 33 Congo 11
## 34 Solomon Islands 10
## 35 St. Lucia 10
## 36 Panama 9
## 37 Malaysia 8
## 38 Ivory Coast 7
## 39 Puerto Rico 7
## 40 El Salvador 6
## 41 Thailand 5
## 42 Sierra Leone 4
## 43 Australia 3
## 44 Cameroon 3
## 45 Liberia 3
## 46 Nigeria 3
## 47 Samoa 3
## 48 Togo 3
## 49 Sao Tome & Principe 2
## 50 Sri Lanka 2
## 51 Taiwan 2
## 52 Tobago 2
## 53 Burma 1
## 54 China 1
## 55 DR Congo 1
## 56 Gabon 1
## 57 Martinique 1
## 58 Principe 1
## 59 St.Vincent-Grenadines 1
## 60 Sulawesi 1
## 61 Sumatra 1
## 62 Suriname 1
The number of ratings by each country of origin is described in the table above
#3. Average ratings for Ecuadorian beans
library(dplyr)
library(knitr)
chocolate <- readRDS(here("data", "chocolate.RDS"))
# One-row summary of reviews of Ecuadorian-bean bars: mean rating, standard
# deviation of the rating, and the number of non-missing ratings.
ecud <- chocolate %>%
  filter(country_of_bean_origin == "Ecuador") %>%
  summarise(
    mean = mean(rating, na.rm = TRUE),
    sd = sd(rating, na.rm = TRUE),
    total = sum(!is.na(rating))
  )

# Render the summary as a pipe-format table.
knitr::kable(ecud, "pipe")
| mean | sd | total |
|---|---|---|
| 3.164384 | 0.5122678 | 219 |
Average ratings, standard deviation of ratings and total number of ratings for Ecuadorian chocolates
#4.Best manufacturer
library(dplyr)
chocolate <- readRDS(here("data", "chocolate.RDS"))
# Keep reviews of Ecuadorian-bean bars and group them by manufacturer so the
# next call can average the rating per manufacturer. The overall mean, sd and
# count columns from the previous section are not needed here.
# NOTE(review): the summarize() call that follows is not sorted by rating, so
# its first row is the alphabetically first manufacturer, not the best-rated
# one.
ecud <- filter(chocolate, country_of_bean_origin == "Ecuador")
company <- group_by(ecud, company_manufacturer)
summarize(company, rating=mean(rating, na.rm = TRUE))## # A tibble: 136 × 2
## company_manufacturer rating
## <chr> <dbl>
## 1 A. Morin 3.75
## 2 Aequare (Gianduja) 2.88
## 3 Alexandre 3.5
## 4 Altus aka Cao Artisan 2.75
## 5 Amano 4
## 6 Amatller (Simon Coll) 2.75
## 7 Amedei 3
## 8 Ara 2.75
## 9 Arete 3.44
## 10 Askinosie 3
## # ℹ 126 more rows
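Of the Ecuadorian bean chocolates, A. Morin is the first manufacturer listed, but the table above is ordered alphabetically by manufacturer rather than by rating; among the rows shown, Amano (4.00) has a higher average rating than A. Morin (3.75).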
#5.Average rating by country of origin
chocolate <- readRDS(here("data", "chocolate.RDS"))

# Mean rating per country of bean origin, highest mean first.
country <- chocolate %>%
  group_by(country_of_bean_origin) %>%
  summarize(rating = mean(rating, na.rm = TRUE)) %>%
  arrange(desc(rating))

# Show the six highest-rated origins as a pipe-format table.
knitr::kable(head(country[, 1:2]), "pipe")
| country_of_bean_origin | rating |
|---|---|
| Tobago | 3.625000 |
| China | 3.500000 |
| Sao Tome & Principe | 3.500000 |
| Solomon Islands | 3.450000 |
| Congo | 3.318182 |
| Thailand | 3.300000 |
Tobago (3.63), China (3.50), and Sao Tome & Principe (3.50) are the countries of origin that have the highest average ratings.
#6.Average rating by country of origin in countries with 10+ ratings
library(dplyr)
chocolate <- readRDS(here("data", "chocolate.RDS"))
country <- group_by(chocolate, country_of_bean_origin)

# Keep only origins with at least 10 reviews (n() is the size of the current
# group), then rank those origins by mean rating, highest first.
country_n <- country %>%
  filter(n() >= 10) %>%
  summarize(rating = mean(rating, na.rm = TRUE)) %>%
  arrange(desc(rating))

# Show the six highest-rated origins as a pipe-format table.
knitr::kable(head(country_n[, 1:2]), "pipe")
| country_of_bean_origin | rating |
|---|---|
| Solomon Islands | 3.450000 |
| Congo | 3.318182 |
| Cuba | 3.291667 |
| Vietnam | 3.287671 |
| Papua New Guinea | 3.280000 |
| Madagascar | 3.266949 |
Solomon Islands (3.45), Congo (3.32), and Cuba (3.29) are the countries of origin that have the highest average rating after filtering out countries with fewer than 10 chocolate bar reviews.
#7.
library(dplyr)
library(forcats)
chocolate <- readRDS(here("data", "chocolate.RDS"))

# Keep origins with at least 50 reviews and bin the cocoa percentage into four
# ordered bands:
#   "1": below 60, "2": 60 to <70, "3": 70 to <90, "4": 90 and above.
# The data are ungrouped before the factor is built so that every origin
# shares the same level order ("1" < "2" < "3" < "4"), even when an origin
# has no bars in one of the bands.
country <- chocolate %>%
  group_by(country_of_bean_origin) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  filter(n >= 50) %>%
  mutate(
    # cocoa_percent is text with a trailing "%"; strip it before converting.
    c_percent = as.numeric(str_remove(cocoa_percent, "%")),
    # right = FALSE makes each band closed on the left: [60, 70), [70, 90), ...
    percent = cut(
      c_percent,
      breaks = c(-Inf, 60, 70, 90, Inf),
      labels = c("1", "2", "3", "4"),
      right = FALSE
    )
  )
# One boxplot of rating by cocoa-percent band for every origin kept in
# `country`, drawn in alphabetical order of origin name. This replaces one
# copy-pasted qplot() call per origin; qplot() is deprecated as of
# ggplot2 3.4.0.
boxplot_origins <- sort(unique(country$country_of_bean_origin))
for (origin_name in boxplot_origins) {
  origin_reviews <- filter(country, country_of_bean_origin == origin_name)
  # print() is required for a ggplot object to be drawn inside a loop.
  print(
    ggplot(origin_reviews, aes(x = percent, y = rating)) +
      geom_boxplot() +
      ggtitle(origin_name)
  )
}
library(gapminder)
library(tidyverse)
library(dplyr)
library(here)
library(ggplot2)
chocolate <- readRDS(here("data", "chocolate.RDS"))
chocolate <- rename(chocolate, country = country_of_bean_origin)

# Build a one-row-per-country lookup of country -> continent from gapminder.
# Both columns are converted to character so the join key matches the
# character `country` column in `chocolate`, and so `continent` can later be
# combined with plain strings without factor/character mixing.
gapminder_2 <- gapminder %>%
  select(country, continent) %>%
  distinct(country, .keep_all = TRUE) %>%
  mutate(across(c(country, continent), as.character))

# Left join keeps every review; origins absent from gapminder get NA.
chocolate <- left_join(x = chocolate, y = gapminder_2, by = "country")
# Manually assign a continent to origins named here; every other row keeps
# the continent it already has. "Blend" is given its own pseudo-continent.
americas_origins <- c(
  "Belize", "St.Vincent-Grenadines", "Grenada", "St. Lucia", "Martinique",
  "U.S.A.", "Suriname", "Tobago", "Trinidad"
)
asia_origins <- c("Burma", "Sulawesi", "Sumatra")
africa_origins <- c(
  "Congo", "Ivory Coast", "Principe", "Sao Tome & Principe", "Sao Tome",
  "DR Congo"
)
oceania_origins <- c(
  "Fiji", "Papua New Guinea", "Samoa", "Solomon Islands", "Vanuatu"
)

chocolate <- mutate(
  chocolate,
  continent = case_when(
    country %in% americas_origins ~ "Americas",
    country %in% asia_origins ~ "Asia",
    country %in% africa_origins ~ "Africa",
    country %in% oceania_origins ~ "Oceania",
    country == "Blend" ~ "Blend",
    # as.character() keeps the result a character column even if the joined
    # continent column is a factor.
    .default = as.character(continent)
  )
)
#check to ensure there are no NA's
# The check works by listing the continent groups: an unassigned origin would
# show up as an extra NA row in the printed tibble. The distinct_points column
# itself is not informative, because n_distinct(continent) computed within a
# group_by(continent) group is always 1.
chocolate %>% group_by(continent) %>% summarize(distinct_points = n_distinct(continent))## # A tibble: 5 × 2
## continent distinct_points
## <chr> <int>
## 1 Africa 1
## 2 Americas 1
## 3 Asia 1
## 4 Blend 1
## 5 Oceania 1
# Attach each origin's review count as `n`, then keep only origins with at
# least 10 reviews and exclude the "Blend" rows. The result stays grouped by
# country.
chocolate <- chocolate %>%
  group_by(country) %>%
  mutate(n = n()) %>%
  filter(n >= 10, country != "Blend")
# Violin plot setup: one subset of the reviews per continent.
Africa <- filter(chocolate, continent == "Africa")
Asia <- filter(chocolate, continent == "Asia")
Americas <- filter(chocolate, continent == "Americas")
Oceania <- filter(chocolate, continent == "Oceania")

# Violin plots: each figure shows the rating distribution for one continent.

# Violin plot of chocolate ratings in African countries
ggplot(Africa, aes(continent, rating)) +
  geom_violin()

# Violin plot of chocolate ratings in Asian countries
ggplot(Asia, aes(continent, rating)) +
  geom_violin()

# Violin plot of chocolate ratings in countries in the Americas
ggplot(Americas, aes(continent, rating)) +
  geom_violin()

# Violin plot of chocolate ratings in Oceanic countries
ggplot(Oceania, aes(continent, rating)) +
  geom_violin()
library(tidyverse)
library(here)
# Reload the raw data so this section starts from the unmodified table.
chocolate <- readRDS(here("data", "chocolate.RDS"))
#add ingredient columns
# Each flag is 1 when the pattern occurs anywhere in `ingredients`, else 0
# (grepl() returns FALSE for a missing value, so NA rows get 0).
# NOTE(review): the "S" pattern also matches any string containing "Sa", so
# every row flagged for salt is flagged for sugar as well -- confirm intended.
chocolate <- chocolate %>%
  mutate(
    beans = as.numeric(grepl("B", ingredients)),
    sugar = as.numeric(grepl("S", ingredients)),
    cocoa_butter = as.numeric(grepl("C", ingredients)),
    vanilla = as.numeric(grepl("V", ingredients)),
    lecithin = as.numeric(grepl("L", ingredients)),
    salt = as.numeric(grepl("Sa", ingredients))
  )
#add characteristic columns
# Each flag is 1 when the word occurs anywhere in
# `most_memorable_characteristics` (substring match), else 0.
chocolate <- chocolate %>%
  mutate(
    char_cocoa = as.numeric(grepl("cocoa", most_memorable_characteristics)),
    char_sweet = as.numeric(grepl("sweet", most_memorable_characteristics)),
    char_nutty = as.numeric(grepl("nutty", most_memorable_characteristics)),
    char_creamy = as.numeric(grepl("creamy", most_memorable_characteristics)),
    char_roasty = as.numeric(grepl("roasty", most_memorable_characteristics)),
    char_earthy = as.numeric(grepl("earthy", most_memorable_characteristics))
  )
#group by year and calculate means
# Drop the descriptive columns, then average each 0/1 flag within every
# review_date. The columns are listed explicitly so the output keeps the same
# names and order.
chocolate <- chocolate %>%
  select(-c(
    ref, company_manufacturer, company_location, country_of_bean_origin,
    specific_bean_origin_or_bar_name, cocoa_percent, ingredients,
    most_memorable_characteristics, rating
  )) %>%
  group_by(review_date) %>%
  summarize(
    across(
      c(
        beans, sugar, cocoa_butter, vanilla, lecithin, salt,
        char_cocoa, char_sweet, char_nutty, char_creamy, char_roasty,
        char_earthy
      ),
      function(x) mean(x, na.rm = TRUE)
    )
  )
# Reshape to one row per (review_date, feature) pair and print every row.
chocolate_long <- pivot_longer(
  chocolate,
  cols = -review_date,
  names_to = "features",
  values_to = "mean_score"
)
print(chocolate_long, n = 192)
## # A tibble: 192 × 3
## review_date features mean_score
## <dbl> <chr> <dbl>
## 1 2006 beans 0.968
## 2 2006 sugar 0.968
## 3 2006 cocoa_butter 0.903
## 4 2006 vanilla 0.694
## 5 2006 lecithin 0.694
## 6 2006 salt 0
## 7 2006 char_cocoa 0.210
## 8 2006 char_sweet 0.161
## 9 2006 char_nutty 0.0323
## 10 2006 char_creamy 0.242
## 11 2006 char_roasty 0.0484
## 12 2006 char_earthy 0.0645
## 13 2007 beans 0.945
## 14 2007 sugar 0.945
## 15 2007 cocoa_butter 0.767
## 16 2007 vanilla 0.548
## 17 2007 lecithin 0.384
## 18 2007 salt 0
## 19 2007 char_cocoa 0.342
## 20 2007 char_sweet 0.0959
## 21 2007 char_nutty 0.0411
## 22 2007 char_creamy 0.233
## 23 2007 char_roasty 0.0137
## 24 2007 char_earthy 0.0685
## 25 2008 beans 0.913
## 26 2008 sugar 0.902
## 27 2008 cocoa_butter 0.75
## 28 2008 vanilla 0.359
## 29 2008 lecithin 0.511
## 30 2008 salt 0
## 31 2008 char_cocoa 0.109
## 32 2008 char_sweet 0.130
## 33 2008 char_nutty 0.152
## 34 2008 char_creamy 0.0978
## 35 2008 char_roasty 0.0435
## 36 2008 char_earthy 0.0435
## 37 2009 beans 0.919
## 38 2009 sugar 0.919
## 39 2009 cocoa_butter 0.772
## 40 2009 vanilla 0.325
## 41 2009 lecithin 0.341
## 42 2009 salt 0
## 43 2009 char_cocoa 0.146
## 44 2009 char_sweet 0.154
## 45 2009 char_nutty 0.154
## 46 2009 char_creamy 0.0894
## 47 2009 char_roasty 0.0813
## 48 2009 char_earthy 0.0732
## 49 2010 beans 0.855
## 50 2010 sugar 0.855
## 51 2010 cocoa_butter 0.709
## 52 2010 vanilla 0.227
## 53 2010 lecithin 0.391
## 54 2010 salt 0.00909
## 55 2010 char_cocoa 0.218
## 56 2010 char_sweet 0.1
## 57 2010 char_nutty 0.145
## 58 2010 char_creamy 0.0909
## 59 2010 char_roasty 0.0364
## 60 2010 char_earthy 0.0727
## 61 2011 beans 0.939
## 62 2011 sugar 0.939
## 63 2011 cocoa_butter 0.693
## 64 2011 vanilla 0.160
## 65 2011 lecithin 0.160
## 66 2011 salt 0.0491
## 67 2011 char_cocoa 0.172
## 68 2011 char_sweet 0.110
## 69 2011 char_nutty 0.117
## 70 2011 char_creamy 0.129
## 71 2011 char_roasty 0.0736
## 72 2011 char_earthy 0.0613
## 73 2012 beans 0.928
## 74 2012 sugar 0.928
## 75 2012 cocoa_butter 0.675
## 76 2012 vanilla 0.186
## 77 2012 lecithin 0.124
## 78 2012 salt 0.0722
## 79 2012 char_cocoa 0.0876
## 80 2012 char_sweet 0.139
## 81 2012 char_nutty 0.103
## 82 2012 char_creamy 0.0722
## 83 2012 char_roasty 0.0619
## 84 2012 char_earthy 0.0464
## 85 2013 beans 0.967
## 86 2013 sugar 0.956
## 87 2013 cocoa_butter 0.776
## 88 2013 vanilla 0.208
## 89 2013 lecithin 0.295
## 90 2013 salt 0.0164
## 91 2013 char_cocoa 0.175
## 92 2013 char_sweet 0.126
## 93 2013 char_nutty 0.115
## 94 2013 char_creamy 0.0710
## 95 2013 char_roasty 0.109
## 96 2013 char_earthy 0.0492
## 97 2014 beans 0.984
## 98 2014 sugar 0.984
## 99 2014 cocoa_butter 0.644
## 100 2014 vanilla 0.0688
## 101 2014 lecithin 0.121
## 102 2014 salt 0.0324
## 103 2014 char_cocoa 0.0607
## 104 2014 char_sweet 0.0972
## 105 2014 char_nutty 0.158
## 106 2014 char_creamy 0.0486
## 107 2014 char_roasty 0.0972
## 108 2014 char_earthy 0.101
## 109 2015 beans 0.986
## 110 2015 sugar 0.979
## 111 2015 cocoa_butter 0.546
## 112 2015 vanilla 0.0599
## 113 2015 lecithin 0.120
## 114 2015 salt 0
## 115 2015 char_cocoa 0.127
## 116 2015 char_sweet 0.106
## 117 2015 char_nutty 0.109
## 118 2015 char_creamy 0.0423
## 119 2015 char_roasty 0.123
## 120 2015 char_earthy 0.0810
## 121 2016 beans 0.982
## 122 2016 sugar 0.977
## 123 2016 cocoa_butter 0.594
## 124 2016 vanilla 0.0507
## 125 2016 lecithin 0.106
## 126 2016 salt 0.00922
## 127 2016 char_cocoa 0.0922
## 128 2016 char_sweet 0.171
## 129 2016 char_nutty 0.157
## 130 2016 char_creamy 0.0553
## 131 2016 char_roasty 0.101
## 132 2016 char_earthy 0.111
## 133 2017 beans 0.981
## 134 2017 sugar 0.981
## 135 2017 cocoa_butter 0.562
## 136 2017 vanilla 0.0286
## 137 2017 lecithin 0.133
## 138 2017 salt 0.00952
## 139 2017 char_cocoa 0.133
## 140 2017 char_sweet 0.0952
## 141 2017 char_nutty 0.0667
## 142 2017 char_creamy 0.0952
## 143 2017 char_roasty 0.124
## 144 2017 char_earthy 0.124
## 145 2018 beans 0.987
## 146 2018 sugar 0.987
## 147 2018 cocoa_butter 0.596
## 148 2018 vanilla 0.0614
## 149 2018 lecithin 0.132
## 150 2018 salt 0
## 151 2018 char_cocoa 0.180
## 152 2018 char_sweet 0.118
## 153 2018 char_nutty 0.0789
## 154 2018 char_creamy 0.0439
## 155 2018 char_roasty 0.110
## 156 2018 char_earthy 0.123
## 157 2019 beans 1
## 158 2019 sugar 1
## 159 2019 cocoa_butter 0.679
## 160 2019 vanilla 0.0259
## 161 2019 lecithin 0.202
## 162 2019 salt 0
## 163 2019 char_cocoa 0.259
## 164 2019 char_sweet 0.145
## 165 2019 char_nutty 0.0725
## 166 2019 char_creamy 0.0881
## 167 2019 char_roasty 0.109
## 168 2019 char_earthy 0.0415
## 169 2020 beans 1
## 170 2020 sugar 1
## 171 2020 cocoa_butter 0.568
## 172 2020 vanilla 0.0370
## 173 2020 lecithin 0.0247
## 174 2020 salt 0
## 175 2020 char_cocoa 0.284
## 176 2020 char_sweet 0.160
## 177 2020 char_nutty 0.0494
## 178 2020 char_creamy 0.0370
## 179 2020 char_roasty 0.0988
## 180 2020 char_earthy 0.0988
## 181 2021 beans 1
## 182 2021 sugar 0.994
## 183 2021 cocoa_butter 0.646
## 184 2021 vanilla 0.0114
## 185 2021 lecithin 0.0800
## 186 2021 salt 0
## 187 2021 char_cocoa 0.297
## 188 2021 char_sweet 0.126
## 189 2021 char_nutty 0.0971
## 190 2021 char_creamy 0.0171
## 191 2021 char_roasty 0.0743
## 192 2021 char_earthy 0.0686
Long-format dataset of the mean of each ingredient and characteristic indicator (i.e., the proportion of reviewed bars with that feature) by review year.
library(tidyverse)
library(here)
library(ggplot2)
#rename flavors so they're more descriptive
chocolate_long <- chocolate_long %>%
  mutate(
    features = str_remove(features, "char_"),
    features = if_else(features == "cocoa_butter", "cocoa butter", features)
  )
#reorder features, ingredients then flavours
chocolate_long <- chocolate_long %>%
  mutate(
    features = fct_relevel(
      features,
      c("cocoa butter", "lecithin", "sugar", "salt", "vanilla"),
      after = 1
    )
  )
#create plot
# mean_score is the yearly mean of a 0/1 flag (the rating column was dropped
# before averaging), so it is the share of bars with that feature, not an
# average rating. The title and y-axis label say so.
chocolate_long %>%
  ggplot(aes(review_date, mean_score)) +
  geom_point(aes(color = features)) +
  geom_smooth(aes(group = features, color = features), se = FALSE) +
  labs(
    title = "Share of chocolate bars with each ingredient and\nkey characteristic, 2006 to 2021",
    caption = "Meriam Berka"
  ) +
  xlab("Year") +
  ylab("Proportion of bars reviewed") +
  scale_x_continuous(
    breaks = c(2008, 2012, 2016, 2020),
    minor_breaks = c(2006, 2010, 2014, 2018, 2022)
  ) +
  theme_gray()
library(tidyverse)
library(here)
library(ggplot2)
# All features are pooled into a single smoother; points are coloured by year.
# NOTE(review): the caption and axis labels suggest this plot is meant to be
# a poor example -- confirm before tidying its labels or breaks.
chocolate_long %>%
  ggplot(mapping = aes(x = review_date, y = mean_score)) +
  geom_point(mapping = aes(color = review_date)) +
  geom_smooth(color = "burlywood4") +
  labs(
    title = "Ave chocolate bar ratings by ingredients and key characteristics from 2006 to 2021",
    caption = "Meriam Berka doesn't endorse this"
  ) +
  xlab("date123%") +
  ylab(NULL) +
  scale_x_continuous(breaks = c(2006, 2017, 2018, 2021)) +
  theme_dark()
library(tidyverse)
library(here)
library(ggplot2)
# Reload the raw data: `chocolate` was overwritten with yearly summaries above.
chocolate <- readRDS(here("data", "chocolate.RDS"))
# One violin per review year, with the median (0.5 quantile) drawn inside it.
# Grid lines use `linewidth`; the `size` argument of element_line() is
# deprecated as of ggplot2 3.4.0.
chocolate %>%
  ggplot(aes(x = as.factor(review_date), y = rating)) +
  geom_violin(draw_quantiles = c(0.5), color = "#FF99FF", fill = "#FFCCFF") +
  ggtitle("Distribution of chocolate bar ratings from 0 to 4 by year of\nreview, 2006-2021") +
  labs(caption = "Data from Tidy Tuesday 01-11-2022:\nR4DS Online Learning Community (2023). Tidy Tuesday: A weekly social data project.\nhttps://github.com/rfordatascience/tidytuesday.") +
  xlab("Year of review") +
  ylab("Rating (0 to 4)") +
  theme(
    plot.title = element_text(
      margin = margin(0.01, 0, 20, 0), family = "serif", face = "bold"
    ),
    axis.text.x = element_text(
      angle = 270, vjust = 0.5, hjust = 1, color = "white"
    ),
    axis.text.y = element_text(color = "white"),
    axis.title.x = element_text(margin = margin(12, 0, 25, 0)),
    axis.title.y = element_text(margin = margin(10, 10, 10, 5)),
    plot.caption = element_text(hjust = 0, color = "white"),
    panel.grid.major = element_line(
      linewidth = 0.5, linetype = "solid", colour = "lavender"
    ),
    panel.grid.minor = element_line(
      linewidth = 0.25, linetype = "solid", colour = "lavender"
    ),
    text = element_text(color = "white"),
    plot.background = element_rect(fill = "gray7"),
    panel.background = element_rect(fill = "gray60"),
    plot.margin = unit(c(0.5, 0.75, 0.5, 0.75), "inches")
  )